#====================================== Description ==========================================
#This script computes Tajima's D and Fu & Li's F statistics for each paired-sites alignment.
#It then simulates 100 alignments from the corresponding unpaired-sites alignment, computes
#their Tajima's D and Fu & Li's F statistics, and compares them with the paired-sites
#statistics to obtain the p-value.

#==================================== Importing the modules ==================================
from Bio import SeqIO
import os
import egglib
import random
random.seed(1)

#===================================== Defining functions ====================================
#This function takes as input a list of files in phylip format and converts them to fasta format.
def ConvPhyToFasta(file_names_list):
    """Convert PHYLIP alignment files to FASTA and return the new file names.

    Each output file gets the input name with its final extension replaced
    by ``.fas``. Both names are printed as each file is processed.

    Args:
        file_names_list: names (or paths) of the PHYLIP files to convert.

    Returns:
        List of the FASTA file names, in the same order as the input.
    """
    fasta_files_saved = []
    for phylip_name in file_names_list:
        print(phylip_name)
        # Strip only the last extension. Cutting at the first '.' would turn
        # "paired.gene1.phy" and "paired.gene2.phy" into the same "paired.fas"
        # (and "./x.phy" into ".fas"), silently overwriting earlier output.
        fasta_name = os.path.splitext(phylip_name)[0] + '.fas'
        fasta_files_saved.append(fasta_name)
        print("%s %s" % (phylip_name, fasta_name))
        SeqIO.convert(phylip_name, 'phylip', fasta_name, "fasta")
    return fasta_files_saved

#This function converts all phylip files located in the current directory to fasta format.	
def ConvAllPhy2Fasta():
    """Convert every ``.phy`` file in the current directory to FASTA.

    The matching entries of ``os.listdir(".")`` are passed to
    ConvPhyToFasta in a single call; nothing is returned.
    """
    phylip_names = [entry for entry in os.listdir(".") if entry.endswith(".phy")]
    ConvPhyToFasta(phylip_names)

#This function gets all fasta files that consist of the paired-sites within HCSS. These were named
#using "paired" prefix and have ".fas" extension.
def GetPairedFas():
    """Return the paired-sites FASTA file names found in the current directory.

    A directory entry qualifies when its name starts with "paired" and ends
    with ".fas". Names are returned in the order ``os.listdir(".")`` yields
    them.
    """
    return [
        entry
        for entry in os.listdir(".")
        if entry.startswith("paired") and entry.endswith(".fas")
    ]
	
#Given a paired-sites alignment and its corresponding unpaired-sites alignment, this function
#simulates a new unpaired-sites alignment by randomly choosing columns from the unpaired-sites,
#such that the simulated alignment has the same number of sites as the paired-sites alignment.
def SimulateAlign(paired_in, unpaired_in):
    """Write a simulated unpaired-sites alignment to ``sim.fas``.

    Columns are drawn at random, with replacement, from the unpaired-sites
    alignment until the simulated alignment has as many columns as the first
    sequence of the paired-sites alignment. The output holds one sequence per
    paired-sites record, named ``SeqSim1``, ``SeqSim2``, ... and built from
    the unpaired-sites record at the same position.

    Args:
        paired_in: path of the paired-sites alignment (FASTA).
        unpaired_in: path of the matching unpaired-sites alignment (FASTA).

    Raises:
        ValueError: if the paired-sites file has no sequences, if the
            unpaired-sites file has fewer sequences than the paired-sites
            file, or if there are no unpaired columns to sample from.
    """
    # Read both alignments fully inside `with` so the handles are closed
    # instead of being left to the garbage collector.
    with open(paired_in, "r") as paired_handle:
        paired_records = list(SeqIO.parse(paired_handle, "fasta"))
    with open(unpaired_in, "r") as unpaired_handle:
        unpaired_records = list(SeqIO.parse(unpaired_handle, "fasta"))

    if not paired_records:
        raise ValueError("no sequences found in " + paired_in)
    if len(unpaired_records) < len(paired_records):
        raise ValueError(
            unpaired_in + " has fewer sequences than " + paired_in
        )

    n_sites = len(paired_records[0].seq)
    n_unpaired_sites = len(unpaired_records[0].seq)
    if n_sites and not n_unpaired_sites:
        raise ValueError("no columns to sample from in " + unpaired_in)

    # One draw per simulated column, in column order: this consumes the
    # random stream exactly as a column-by-column loop would, so seeded runs
    # stay reproducible.
    columns = [random.randint(0, n_unpaired_sites - 1) for _ in range(n_sites)]

    # Plain strings make the per-column lookups cheap and allow a single
    # join per row instead of repeated concatenation.
    unpaired_rows = [
        str(record.seq) for record in unpaired_records[:len(paired_records)]
    ]

    with open("sim.fas", "w") as out:
        for number, row in enumerate(unpaired_rows, 1):
            out.write(">SeqSim" + str(number) + "\n")
            out.write("".join([row[c] for c in columns]) + "\n")
	
#======================================= main program ============================================
#ConvAllPhy2Fasta()  #Uncomment this call when the input files are in phylip format
paired_fas = GetPairedFas()

#Number of unpaired-sites alignments simulated for each paired-sites alignment.
SimLen = 100

#Column titles of the summary row; shared by the stats file and the console output.
SummaryHeader = "name\tTajima's D\tunp Av Taj\tP-val\tFu_Li_F\tunp Av Fu\tp-val Fu\n"

#This loop iterates over the paired-sites alignments. For each one it computes the Tajima and
#Fu & Li statistics, simulates SimLen unpaired-sites alignments, computes their statistics and
#compares them to those of the paired-sites alignment. It then writes every simulated value and
#the final p-values to "<paired file name>stats.txt".
for p_fas in paired_fas:
    #The unpaired-sites alignment is expected under the same name prefixed with "un".
    u_fas = "un" + p_fas
    print("---------- " + p_fas + " -- " + u_fas + " --------------")

    #`with` guarantees the stats file is closed even if a step below raises.
    with open(p_fas + "stats.txt", "w") as t:
        #Statistics of the paired-sites alignment.
        p_sum_stats = egglib.Align(p_fas).polymorphismBPP()
        #NOTE(review): "Deta" is the entry this script uses as Tajima's D and "Fstar" the one
        #it uses as Fu & Li's F; confirm both keys against the EggLib polymorphismBPP docs.
        p_Tajima_D = p_sum_stats["Deta"]
        p_Fu_Li_F = p_sum_stats["Fstar"]
        print("Paired-site Tajim's D and Fu & Li's F : " + str(p_Tajima_D) + "\t" + str(p_Fu_Li_F))
        print("Simulated unpaired-site Tajima'D and Fu & Li's F")

        #Statistics of the SimLen simulated unpaired-sites alignments.
        sim_taj = []
        sim_fu = []
        for x in range(SimLen):
            SimulateAlign(p_fas, u_fas)
            u_sum_stats = egglib.Align("sim.fas").polymorphismBPP()
            u_Tajima_D = u_sum_stats["Deta"]
            u_Fu_Li_F = u_sum_stats["Fstar"]
            sim_taj.append(u_Tajima_D)
            sim_fu.append(u_Fu_Li_F)
            print(str(x) + "%" + " : " + str(u_Tajima_D) + " " + str(u_Fu_Li_F))
            #The simulated alignment is only a scratch file for this iteration.
            os.remove("sim.fas")

        #Sums of the simulated statistics, used for the averages below.
        TotTaj = sum(sim_taj)
        TotFu = sum(sim_fu)
        #Number of simulations whose statistic is strictly below the paired-sites value;
        #divided by SimLen this is the value reported as the p-value.
        c_Taj = sum(1 for value in sim_taj if p_Tajima_D > value)
        c_Fu = sum(1 for value in sim_fu if p_Fu_Li_F > value)

        #Each simulated value is written preceded by a single space, all on one line.
        taj = "".join([" " + str(value) for value in sim_taj])
        fu = "".join([" " + str(value) for value in sim_fu])

        #Build the summary row once so the file and the console cannot drift apart.
        summary_row = "\t".join([
            p_fas,
            str(p_Tajima_D),
            str(float(TotTaj) / SimLen),
            str(float(c_Taj) / SimLen),
            str(p_Fu_Li_F),
            str(float(TotFu) / SimLen),
            str(float(c_Fu) / SimLen),
        ])

        #Writing all simulation results and final p-value
        t.write(str(p_Tajima_D) + "\n")
        t.write(taj + "\n")
        t.write(str(p_Fu_Li_F) + "\n")
        t.write(fu + "\n")
        t.write(SummaryHeader)
        t.write(summary_row + "\n")
        print(SummaryHeader)
        print(summary_row)

print("done")

